Goal: Using the data collected from existing customers, build a model that will help the marketing team identify potential customers who are relatively more likely to subscribe to a term deposit, and thus increase their hit ratio.
Resources Available: The historical data for this project is available at https://archive.ics.uci.edu/ml/datasets/Bank+Marketing
# Load the UCI Bank Marketing dataset and inspect its dimensions.
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')  # NOTE(review): silences ALL warnings, including deprecations worth seeing
# NOTE(review): the UCI bank-full.csv distribution is semicolon-delimited; this call
# assumes a comma-delimited (or pre-converted) local copy -- confirm, else pass sep=';'.
bank_df = pd.read_csv('bank-full.csv')
bank_df.shape
Exploratory data quality report, covering the analyses below.
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
bank_df.info()
bank_df.head()
bank_df.isnull().sum()
bank_df.describe(include='all').transpose()
bank_df['job'].value_counts().plot.bar()
bank_df['Target'].value_counts().plot.bar()
bank_df['marital'].value_counts().plot.bar()
bank_df['education'].value_counts().plot.bar()
bank_df['default'].value_counts().plot.bar()
bank_df['housing'].value_counts().plot.bar()
bank_df['loan'].value_counts().plot.bar()
bank_df['contact'].value_counts().plot.bar()
bank_df['month'].value_counts().plot.bar()
bank_df['poutcome'].value_counts().plot.bar()
bank_df.describe(include=[np.number]).transpose()
bank_df.hist(figsize=(25,15))
Outlier detection analysis
# Box plots of the wide-range numeric columns to surface outliers.
# A fresh figure per column: successive plt.boxplot calls on one axes
# would otherwise draw on top of each other.
for col in ['age', 'balance', 'campaign', 'day', 'duration']:
    plt.figure()
    plt.boxplot(bank_df[col])
    plt.title(col)
As part of the EDA we can see large variations in the balance, age, duration and days columns; they contain outliers and need to be cleaned.
sns.pairplot(bank_df)
As per the pair plot, several numeric columns are heavily skewed towards 0, so we will apply normalization or scaling to the data.
# Correlation matrix over the numeric columns only. Calling .corr() on the raw
# frame relied on pandas silently dropping object columns, which newer pandas
# versions no longer do; selecting numeric dtypes keeps the result identical
# and version-proof.
corr = bank_df.select_dtypes(include=[np.number]).corr()
corr
# Handling outlier data via Tukey's IQR rule.
# NOTE(review): the fences are computed below but never applied -- no rows are
# filtered or clipped, so the outlier "cleaning" promised earlier never happens.
Q1 = bank_df.quantile(0.25)
Q3 = bank_df.quantile(0.75)
IQR = Q3 - Q1  # inter-quartile range per numeric column
print(IQR)
V = 1.5 * IQR  # standard Tukey fence width; currently unused (see note above)
bank_df.shape
sns.heatmap(corr,annot=True)
As per the correlation plot, the pdays/previous and campaign/day feature pairs show higher correlation than the other variables.
# Target-rate inspection: a grouped bar chart of Target counts for every
# categorical feature. The original also ran a standalone 'job' crosstab,
# which exactly duplicated the loop's first iteration, so the loop alone
# now covers all columns.
col_list = ['job', 'marital', 'education', 'default', 'housing',
            'loan', 'contact', 'month', 'poutcome']
for x in col_list:
    bnk_data = pd.crosstab(index=bank_df[x], columns=bank_df['Target'])
    bnk_data.plot.bar()
(Prepare the data for analytics)
# Prepare the data for modelling.
bank_df.info()
# Work on a real copy: the original used plain assignment (df = bank_df),
# which only aliases the same object, so the encoding below would silently
# mutate bank_df despite the "copying" comment.
df = bank_df.copy()
df.shape
df.describe(include='all').transpose()
df.info()
# Decision trees in scikit-learn need numeric input, so every object
# (string) column is replaced by its categorical integer codes.
for feature in df.columns:
    if df[feature].dtype == 'object':
        df[feature] = pd.Categorical(df[feature]).codes
df.info()
# Floor negative balances at zero before scaling.
# NOTE(review): this discards the "customer in debt" signal -- confirm intended.
df.loc[df['balance'] < 0, ['balance']] = 0
from sklearn.preprocessing import minmax_scale
# Min-max scale the wide-range numeric columns to [0, 1]. The original
# repeated the same minmax_scale call six times; a loop keeps the column
# list in one place.
for col in ['balance', 'day', 'duration', 'campaign', 'pdays', 'previous']:
    df[col] = minmax_scale(df[col])
df[df['balance'] < 0]  # sanity check: should select no rows after the floor above
sns.pairplot(df)
df.info()
# Hold out 30% of the data for testing; random_state is fixed so the split
# (and all downstream scores) are reproducible.
# NOTE(review): the Target classes are imbalanced -- stratify=y would keep the
# class ratio equal across the two partitions; left unchanged here.
from sklearn.model_selection import train_test_split
X = df.drop('Target', axis=1)
y = df[['Target']]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=101)
for partition in (X_train, y_train, X_test, y_test):
    print(partition.shape)
# Class distribution within each partition.
y_train['Target'].value_counts()
y_test['Target'].value_counts()
(Create the ensemble models)
Build models to predict the Target variable (term deposit subscription). We'll try a decision tree and several ensemble variants (bagging, AdaBoost, gradient boosting and random forest), tune the hyperparameters in each model, and choose the one which gives the best performance, using area under the ROC curve as the evaluation metric.
# Baseline model: an unpruned decision tree with entropy as the split criterion.
from sklearn.tree import DecisionTreeClassifier
dt_model = DecisionTreeClassifier(criterion='entropy')
# Confirm all features are numeric before fitting.
X_train.info()
dt_model.fit(X_train, y_train)
train_char_label = ['No', 'Yes']  # class labels for tree visualisation (0 -> 'No', 1 -> 'Yes')
from IPython.display import Image
#import pydotplus as pydot
from sklearn import tree
from os import system
# Commented-out export of the fitted tree to a local .dot file.
# NOTE(review): the close() line below refers to Credit_Tree_File, not the
# Bank_Tree_File opened above -- stale copy-paste; fix before re-enabling.
#Bank_Tree_File = open('D:\DW\PGP_AI_ML\Ensemble\Project\bank_tree.dot','w')
#dot_data = tree.export_graphviz(dt_model, out_file=Bank_Tree_File, feature_names = list(X_train), class_names = list(train_char_label))
#Credit_Tree_File.close()
# Feature importances: the (normalized) total reduction of the split criterion
# contributed by each feature, a.k.a. Gini importance.
print (pd.DataFrame(dt_model.feature_importances_, columns = ["Imp"], index = X_train.columns))
#https://medium.com/@rnbrown/creating-and-visualizing-decision-trees-with-python-f8e8fa394176
#https://stackoverflow.com/questions/36979421/how-can-i-use-conda-to-install-pydotplus
# Render the fitted tree as an inline PNG.
# sklearn.externals.six was removed in scikit-learn 0.23; the stdlib io module
# provides the same StringIO.
from io import StringIO
from IPython.display import Image
from sklearn.tree import export_graphviz
import pydotplus
dot_data = StringIO()
# Label the nodes with real feature and class names instead of x[i] / y[i].
export_graphviz(dt_model, out_file=dot_data,
                filled=True, rounded=True,
                special_characters=True,
                feature_names=list(X_train.columns),
                class_names=train_char_label)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
Image(graph.create_png())
y_predict = dt_model.predict(X_test)
# Accuracy on train vs test plus the confusion matrix; a large gap between
# the two scores indicates the unpruned tree is overfitting.
from sklearn import metrics
print(dt_model.score(X_train, y_train))
print(dt_model.score(X_test, y_test))
print(metrics.confusion_matrix(y_test, y_predict))
# Compute recall/precision from the current predictions instead of
# hard-coding cell counts copied from one earlier run of the matrix.
print(metrics.recall_score(y_test, y_predict))     # recall for the positive class
print(metrics.precision_score(y_test, y_predict))  # precision for the positive class
# For this model precision and recall come out roughly the same.
Advantages
Disadvantages
Ensemble Learning - Bagging
# Ensemble learning -- bagging 50 copies of the unpruned entropy tree.
from sklearn.ensemble import BaggingClassifier
# NOTE(review): base_estimator was renamed to estimator in scikit-learn 1.2+.
bgcl = BaggingClassifier(base_estimator=dt_model, n_estimators=50)
bgcl = bgcl.fit(X_train, y_train)
y_predict = bgcl.predict(X_test)
print(bgcl.score(X_test, y_test))
print(metrics.confusion_matrix(y_test, y_predict))
# Recall/precision computed from the current predictions rather than
# hard-coded counts from a previous run.
print(metrics.recall_score(y_test, y_predict))
print(metrics.precision_score(y_test, y_predict))
# For this model precision is better than recall.
Ensemble Learning - AdaBoosting
# Ensemble learning -- AdaBoost, 10 rounds on top of the deep decision tree.
from sklearn.ensemble import AdaBoostClassifier
# NOTE(review): base_estimator was renamed to estimator in scikit-learn 1.2+.
abcl = AdaBoostClassifier(base_estimator=dt_model, n_estimators=10)
abcl = abcl.fit(X_train, y_train)
y_predict = abcl.predict(X_test)
print(abcl.score(X_test, y_test))
print(metrics.confusion_matrix(y_test, y_predict))
# Recall/precision computed from the current predictions rather than
# hard-coded counts from a previous run.
print(metrics.recall_score(y_test, y_predict))
print(metrics.precision_score(y_test, y_predict))
# For this model precision and recall look almost the same.
Ensemble Learning - GradientBoost
# Ensemble learning -- gradient boosting with 50 stages.
from sklearn.ensemble import GradientBoostingClassifier
gbcl = GradientBoostingClassifier(n_estimators=50)
gbcl = gbcl.fit(X_train, y_train)
y_predict = gbcl.predict(X_test)
print(gbcl.score(X_test, y_test))
print(metrics.confusion_matrix(y_test, y_predict))
# Recall/precision computed from the current predictions rather than
# hard-coded counts from a previous run.
print(metrics.recall_score(y_test, y_predict))
print(metrics.precision_score(y_test, y_predict))
# For this model precision is better than recall.
Ensemble RandomForest Classifier
# Ensemble learning -- random forest with 50 trees.
from sklearn.ensemble import RandomForestClassifier
rfcl = RandomForestClassifier(n_estimators=50)
rfcl = rfcl.fit(X_train, y_train)
y_predict = rfcl.predict(X_test)
print(rfcl.score(X_test, y_test))
print(metrics.confusion_matrix(y_test, y_predict))
# Recall/precision computed from the current predictions rather than
# hard-coded counts from a previous run.
print(metrics.recall_score(y_test, y_predict))
print(metrics.precision_score(y_test, y_predict))
# For this model precision is better than recall.
(Tuning the model)
# Hyperparameter tuning for gradient boosting: 3-fold cross-validated grid
# search over learning_rate x subsample, scored by area under the ROC curve.
from sklearn.model_selection import GridSearchCV
param_grid = {
    "learning_rate": [0.2, 0.6, 0.9],
    "subsample": [0.3, 0.6, 0.9],
}
GBC = GradientBoostingClassifier(max_depth=2, n_estimators=200)
folds = 3
grid_search_GBC = GridSearchCV(
    GBC,
    cv=folds,
    param_grid=param_grid,
    scoring='roc_auc',
    return_train_score=True,  # keep train scores so over/underfitting can be plotted
    verbose=1,
)
grid_search_GBC.fit(X_train, y_train)
cv_results = pd.DataFrame(grid_search_GBC.cv_results_)
cv_results.head()
# Plot mean train/test AUC against learning_rate, one subplot per subsample
# value, to pick the best hyperparameter combination visually.
plt.figure(figsize=(16, 6))
for n, subsample in enumerate(param_grid['subsample']):
    plt.subplot(1, len(param_grid['subsample']), n + 1)
    # BUG FIX: the original assigned this slice to `df`, silently clobbering
    # the prepared modelling dataframe defined earlier in the notebook.
    subset = cv_results[cv_results['param_subsample'] == subsample]
    plt.plot(subset["param_learning_rate"], subset["mean_test_score"])
    plt.plot(subset["param_learning_rate"], subset["mean_train_score"])
    plt.xlabel('learning_rate')
    plt.ylabel('AUC')
    plt.title("subsample={0}".format(subsample))
    plt.ylim([0.60, 1])
    plt.legend(['test score', 'train score'], loc='upper left')
    plt.xscale('log')
The results show that a subsample size of 0.9 and a learning_rate of about 0.2 seem optimal. Also, gradient boosting and random forest have achieved the highest ROC AUC across the hyperparameters tried. Let's build a final model with the chosen hyperparameters.
Regularising the Decision Tree
# Regularised tree: cap depth at 7 to curb the overfitting seen in the
# unrestricted baseline tree.
reg_dt_model = DecisionTreeClassifier(criterion='entropy', max_depth=7)
reg_dt_model.fit(X_train, y_train)
# BUG FIX: report the importances of the regularised model; the original
# printed dt_model's importances here.
print(pd.DataFrame(reg_dt_model.feature_importances_, columns=["Imp"], index=X_train.columns))
# sklearn.externals.six was removed in scikit-learn 0.23; the stdlib io module
# provides the same StringIO.
from io import StringIO
from IPython.display import Image
from sklearn.tree import export_graphviz
import pydotplus
dot_data = StringIO()
# Label the nodes with real feature and class names instead of x[i] / y[i].
export_graphviz(reg_dt_model, out_file=dot_data,
                filled=True, rounded=True,
                special_characters=True,
                feature_names=list(X_train.columns),
                class_names=train_char_label)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
Image(graph.create_png())
# Evaluate the regularised tree on the held-out test set.
y_predict = reg_dt_model.predict(X_test)
reg_dt_model.score(X_test, y_test)
print(metrics.confusion_matrix(y_test, y_predict))
# Recall/precision computed from the current predictions rather than
# hard-coded counts from a previous run.
print(metrics.recall_score(y_test, y_predict))
print(metrics.precision_score(y_test, y_predict))
# For this model precision is better than recall.